notebook.community

Edit and run



In [1]:

    
from itertools import count
from scipy import sparse
from scipy.linalg import norm
from scipy.sparse.linalg import svds
import numpy as np
import timeit
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from math import sqrt
import matplotlib.pyplot as plt
%matplotlib inline
import time
import json



In [2]:

    
n = 300000



In [3]:

    
raw = np.genfromtxt('train_triplets.txt', max_rows=n, dtype="S40,S18,i8")



In [4]:

    
'''
Construct COOordinate matrix from raw
'''
row = []
col = []
data = []

user_to_row = {}
song_to_col = {}
user_count = count()
song_count = count()

for i, (uid, sid, play_count) in enumerate(raw):
    if not uid in user_to_row:
        user_to_row[uid] = next(user_count)
    row.append(user_to_row[uid])
    
    if not sid in song_to_col:
        song_to_col[sid] = next(song_count)
    col.append(song_to_col[sid])        
    
    data.append(float(play_count))

arr = np.array(data) # transform list to ndarray for conveninence



In [5]:

    
'''
Preprocess the data by binning the play counts into b bins 
and store data in sparse coo matrix
'''
b = 10
for i in range(b):
    arr[((arr >= ((1 << i))) & (arr <= ((1 << (i+1)) - 1)))] = (i+1)
arr[arr >= (1 << b)] = b

M = sparse.coo_matrix((arr, (row, col)))



In [6]:

    
'''
Helper methods for removing users/songs
'''
def removeCol(M):
    M = M.tocsc() #transform to CSC for fast column slicing
    cols = []
    for i in range(M.shape[1]):
        nnz = (M.getcol(i)).count_nonzero()
        if nnz > 5:
            cols.append(i)
    M = M[:, cols]
    return M

def removeRow(M):
    M = M.tocsr() #transform to CSR for fast row slicing
    rows = []
    for i in range(M.shape[0]):
        nnz = (M.getrow(i)).count_nonzero()
        if nnz > 5:
            rows.append(i)
    M = M[rows, :]
    return M



In [7]:

    
'''
Preprocess the data to avoid the cold start issue. 
That is, recursively remove all users/songs
that have less or equal than 5 occurrences in M, 
until M no longer changes.
'''
prevShape = M.shape
M = removeCol(M)
currShape = M.shape
while prevShape != currShape:
    prevShape = currShape
    M = removeCol(M)
    M = removeRow(M)
    currShape = M.shape



In [8]:

    
M = M.T # transpose M to have users to columns and songs to rows



In [9]:

    
'''
To properly evaluate our approach set-aside 200 randomly 
selected non-zero datapoints from the matrix as a test set.
That is, store their values separately and set them to 0 in M.
'''
M = M.tocsr()
nz_row_indices, nz_col_indices = M.nonzero()
nz_indices = np.vstack((nz_row_indices, nz_col_indices))
np.random.shuffle(nz_indices.T)
test_set_indices = nz_indices[:,:200]
M_test = M[test_set_indices[0,:],test_set_indices[1,:]]
M[test_set_indices[0,:],test_set_indices[1,:]] = 0 # set test instances to zero in M



In [10]:

    
'''
Additionally create a small validation set. That is, set-aside 1000 randomly selected
non-zero data-points and set them to ? in M. You can use this validation set for
hyper-parameter tuning and to judge the convergence of your methods.
'''
nz_row_indices, nz_col_indices = M.nonzero()
nz_indices = np.vstack((nz_row_indices, nz_col_indices))
np.random.shuffle(nz_indices.T)
val_set_indices = nz_indices[:,:1000]
M_val = M[val_set_indices[0,:], val_set_indices[1,:]]
M[val_set_indices[0,:], val_set_indices[1,:]] = 0



In [11]:

    
# Functions
class Timer:
    def __init__(self):
        self.timers = {}
        self.next_id = 0
    def start(self, msg=""):
        print("START %d-timer on: %s" % (self.next_id, msg))
        self.timers[self.next_id] = [msg, time.time()]
        self.next_id += 1
    def end(self):
        if self.next_id-1 in self.timers:
            end_time = time.time() - self.timers[self.next_id-1][1]
            print("END %d-timer on: %s in %.2f sec" % (
                self.next_id,
                self.timers[self.next_id-1][0],
                end_time))
            self.next_id -= 1
            return end_time
        else:
            print("NO TIMER started")
            return None


#The objective loss
def loss(M, Q, P, nnz_elem, opt):
    err = 0
    reg_x = 0
    reg_i = 0
    for idx in range(nnz_elem.shape[1]): # over all non-zero elements in M
        i = nnz_elem[0,idx]
        j = nnz_elem[1,idx]
        err += (M[i, j] - np.dot(Q[i,:], P[:, j]))**2
        reg_x += norm(P[:, j])**2
        reg_i += norm(Q[i,:])**2
    return err + opt['lambda_1']*reg_x + opt['lambda_2']*reg_i



def validation_loss(val_set_indices, M_val, Q, P, opt):
    err = 0
    reg_x = 0
    reg_i = 0
    for k in range(val_set_indices.shape[1]):
        i = val_set_indices[0,k]
        j = val_set_indices[1,k]
        err += (M_val[0, k] - np.dot(Q[i,:], P[:, j]))**2
        reg_x += norm(P[:, j])**2
        reg_i += norm(Q[i,:])**2
    return err + opt['lambda_1']*reg_x + opt['lambda_2']*reg_i


def test_error(test_set_indices, M_test, Q, P):
    err = 0
    for k in range(test_set_indices.shape[1]):
        i = test_set_indices[0,k]
        j = test_set_indices[1,k]
        err += (M_test[0, k] - np.dot(Q[i,:], P[:, j]))**2
    return sqrt(err*1.0/test_set_indices.shape[1])
    

# Standard Gradient Descent
def GD(M, Q, P, nnz_elem, val_set_indices, M_val, opt, timer=None):
    l=[]
    last_val_loss = np.inf
    last_loss = np.inf
    status = "Maximum steps reached"
    iteration_measured = False
    for step in range(opt['steps']):
        if not iteration_measured:
            timer.start("ITERATION %s" % json.dumps(opt,indent=1))
        delta_P = np.zeros_like(P)
        delta_Q = np.zeros_like(Q)
        for idx in range(nnz_elem.shape[1]):
            i = nnz_elem[0,idx]
            j = nnz_elem[1,idx]
            eij = M[i,j] - np.dot(Q[i,:], P[:,j])
            for k in range(opt['factors']):
                delta_P[k,j] += opt['learning_rate'] * 2 * (eij * Q[i,k] - opt['lambda_1'] * P[k,j])
                delta_Q[i,k] += opt['learning_rate'] * 2 * (eij * P[k,j] - opt['lambda_2'] * Q[i,k])
        P += delta_P
        Q += delta_Q
        if not iteration_measured:
            timer.end()
            iteration_measured = True
        
        if (step) % opt['epoch'] == 0:
            cur_loss = loss(M=M, Q=Q, P=P, nnz_elem=nnz_elem, opt=opt)
            l.append(cur_loss)
            val_loss = validation_loss(val_set_indices=val_set_indices, M_val=M_val, Q=Q, P=P, opt=opt)
            if val_loss > last_val_loss:
                status = "Validation loss increased"
                break
            elif abs(cur_loss - last_loss) < opt['stopping_eps']:
                status = "Training loss converged"
                break
            else:
                last_val_loss = val_loss
                last_loss = cur_loss
            print("After %d steps:\n\tTrainig Loss: %.4lf\n\tValidation Loss: %.4lf" % (step+1, l[-1], val_loss))
    plt.plot(np.arange(len(l)), l)
    return last_val_loss, status
  

'''
Get indices batch of size end - start
'''
def get_batch(nnz_elem, start, end):
    return nnz_elem[:,start:end]


'''
Get count of pairs in a set
Arguments: indices: ndarray
Returns: dict with number of occurences of each unique id found in indices
'''
def get_count(indices):
    unique, counts = np.unique(indices, return_counts=True)
    to_dict = dict(zip(unique, counts))
    return to_dict


def SGD(M, Q, P, nnz_elem, val_set_indices, M_val, opt, timer=None):
    l=[]
    last_val_loss = np.inf
    last_loss = np.inf
    start = 0
    end = 0
    status = "Maximum steps reached"
    iteration_measured = False
    np.random.shuffle(nnz_elem.T)
    for step in range(opt['steps']):
        if not iteration_measured:
            timer.start("ITERATION %s" % json.dumps(opt,indent=1))
        start = end # update bounds for current batch
        end += opt['batch_size']
        delta_P = np.zeros_like(P)
        delta_Q = np.zeros_like(Q)
        batch = get_batch(nnz_elem, start, end) # contains indices for current batch
        for idx in range(batch.shape[1]):
            i = batch[0,idx]
            j = batch[1,idx]
            eij = M[i,j] - np.dot(Q[i,:], P[:,j])
            for k in range(opt['factors']):
                delta_P[k,j] += (get_count(nnz_elem[1])[j]/get_count(batch[1])[j]) * opt['learning_rate'] * 2 * (eij * Q[i,k] - opt['lambda_1'] * P[k,j])
                delta_Q[i,k] += (get_count(nnz_elem[0])[i]/get_count(batch[0])[i]) * opt['learning_rate'] * 2 * (eij * P[k,j] - opt['lambda_2'] * Q[i,k])
        P += delta_P
        Q += delta_Q
        if not iteration_measured:
            timer.end()
            iteration_measured = True
        if (step) % opt['epoch'] == 0:
            cur_loss = loss(M=M, Q=Q, P=P, nnz_elem=nnz_elem, opt=opt)
            l.append(cur_loss)
            val_loss = validation_loss(val_set_indices=val_set_indices, M_val=M_val, Q=Q, P=P, opt=opt)
            if val_loss > last_val_loss:
                status = "Validation loss increased"
                break
            elif abs(cur_loss - last_loss) < opt['stopping_eps']:
                status = "Training loss converged"
                break
            else:
                last_val_loss = val_loss
                last_loss = cur_loss
            print("After %d steps:\n\tTrainig Loss: %.4lf\n\tValidation Loss: %.4lf" % (step+1, l[-1], val_loss))
    plt.plot(np.arange(len(l)), l)
    return last_val_loss, status

def set_optimizer_options(name, learning_rate, lambda_1, lambda_2, steps, epoch, factors, batch_size, stopping_eps):
    options = {
                'optimizer'     : name,
                'learning_rate' : learning_rate,
                'lambda_1'      : lambda_1,
                'lambda_2'      : lambda_2,
                'steps'         : steps,
                'epoch'         : epoch,
                'factors'       : factors,
                'batch_size'    : batch_size,
                'stopping_eps'  : stopping_eps
              }
    return options


def init(M,opt):
    U, S, Vt = svds(M,k=opt['factors'])
    Q = U*S
    P = Vt
    nz_i_indices, nz_x_indices = M.nonzero()
    nz_indices = np.vstack((nz_i_indices, nz_x_indices))
    return Q, P, nz_i_indices, nz_x_indices, nz_indices


'''
def train_hyperparameters(optimizer_func, M, Q, P, nnz_elem, opt, timer):
    timer.start("%s" % json.dumps(opt, indent=1))
    optimizer_func(M,Q,P,nnz_elem,opt)
    timer.end()
    pass
'''









    Out[11]:





'\ndef train_hyperparameters(optimizer_func, M, Q, P, nnz_elem, opt, timer):\n    timer.start("%s" % json.dumps(opt, indent=1))\n    optimizer_func(M,Q,P,nnz_elem,opt)\n    timer.end()\n    pass\n'



In [12]:

    
# Creater timer object
timer = Timer()

c) Learn the optimal P and Q using standard gradient descent. That is, during each iteration you have to use all of the training examples and update Q and P for all users and songs at once.

Approach

store deltas

$$ \Delta_u = \sum\limits_{song\;s\;of\;u} \frac{\partial \mathcal{L}}{\partial p_u}$$$$ \Delta_i = \sum\limits_{user\;u\;of\;i} \frac{\partial \mathcal{L}}{\partial p_i}$$

update feature vectors

$$ p_u = p_u - \Delta_u $$$$ p_i = p_i - \Delta_i $$



In [22]:

    
# Standard Gradient Descent
opt = set_optimizer_options(name='GradientDescent', 
                                learning_rate=0.0001, 
                                lambda_1=0.5, 
                                lambda_2=0.5, 
                                steps=5, 
                                epoch=1, 
                                factors=30,
                                batch_size=None,
                                stopping_eps=0.001)
Q, P, nz_i_indices, nz_x_indices, nz_indices = init(M=M,opt=opt)
val_loss, status = GD(M=M, Q=Q, P=P, nnz_elem=nz_indices, val_set_indices=val_set_indices, M_val=M_val, opt=opt, timer=timer)
err = test_error(test_set_indices=test_set_indices, M_test=M_test, Q=Q, P=P)
print("STATUS: %s\nVALIDATION LOSS: %.4lf\nTEST ERROR: %.10lf" % (status, val_loss, err))









    



START 0-timer on: ITERATION {
 "stopping_eps": 0.001,
 "lambda_2": 0.5,
 "lambda_1": 0.5,
 "batch_size": null,
 "optimizer": "GradientDescent",
 "epoch": 1,
 "factors": 30,
 "learning_rate": 0.0001,
 "steps": 5
}
END 1-timer on: ITERATION {
 "stopping_eps": 0.001,
 "lambda_2": 0.5,
 "lambda_1": 0.5,
 "batch_size": null,
 "optimizer": "GradientDescent",
 "epoch": 1,
 "factors": 30,
 "learning_rate": 0.0001,
 "steps": 5
} in 18.88 sec
After 1 steps:
	Trainig Loss: 10974032.3922
	Validation Loss: 48114.3729
After 2 steps:
	Trainig Loss: 10106190.8349
	Validation Loss: 44602.2570
After 3 steps:
	Trainig Loss: 9326964.6217
	Validation Loss: 41399.9068
After 4 steps:
	Trainig Loss: 8626695.7838
	Validation Loss: 38540.4846
After 5 steps:
	Trainig Loss: 7997024.5068
	Validation Loss: 35914.0402
STATUS: Maximum steps reached
VALIDATION LOSS: 35914.0402
TEST ERROR: 1.7775528850

(d) Learn the optimal P and Q using the original stochastic gradient descent (mini-batches of size 1). That is, during each iteration you sample a single random training example $r_{xi}$ and update only the respective affected parameters $p_x$ and $q_i$.



In [20]:

    
# Stochastic Gradient Descent
opt = set_optimizer_options(name='StochasticGradientDescent', 
                                learning_rate=0.00009, 
                                lambda_1=0.5, 
                                lambda_2=0.5, 
                                steps=500, 
                                epoch=10,
                                factors=30,
                                batch_size=1,
                                stopping_eps=0.001)
#Q, P, nz_i_indices, nz_x_indices, nz_indices = init(M,opt)
val_loss, status = SGD(M=M, Q=Q, P=P, nnz_elem=nz_indices, val_set_indices=val_set_indices, M_val=M_val, opt=opt, timer=timer)
err = test_error(test_set_indices=test_set_indices, M_test=M_test, Q=Q, P=P)
print("STATUS: %s\nVALIDATION LOSS: %.4lf\nTEST ERROR: %.10lf" % (status, val_loss, err))









    



START 0-timer on: ITERATION {
 "stopping_eps": 0.001,
 "lambda_2": 0.5,
 "lambda_1": 0.5,
 "batch_size": 1,
 "optimizer": "StochasticGradientDescent",
 "epoch": 10,
 "factors": 30,
 "learning_rate": 9e-05,
 "steps": 500
}
END 1-timer on: ITERATION {
 "stopping_eps": 0.001,
 "lambda_2": 0.5,
 "lambda_1": 0.5,
 "batch_size": 1,
 "optimizer": "StochasticGradientDescent",
 "epoch": 10,
 "factors": 30,
 "learning_rate": 9e-05,
 "steps": 500
} in 0.60 sec
After 1 steps:
	Trainig Loss: 28046515279438688.0000
	Validation Loss: 37453381698.8534
After 11 steps:
	Trainig Loss: 28046514727521240.0000
	Validation Loss: 37453381698.7228
After 21 steps:
	Trainig Loss: 28046514728213872.0000
	Validation Loss: 37453381698.6224
After 31 steps:
	Trainig Loss: 28046514728210452.0000
	Validation Loss: 37453381698.4204
After 41 steps:
	Trainig Loss: 28046514818387568.0000
	Validation Loss: 37453381586.5587
After 51 steps:
	Trainig Loss: 28046514665246272.0000
	Validation Loss: 37453381586.5476
STATUS: Validation loss increased
VALIDATION LOSS: 37453381586.5476
TEST ERROR: 1.9966334385

e) Learn the optimal P and Q similarly to d) this time using larger mini-batches of size $S$.



In [15]:

    
'''
Parameters to SGD with batch size greater than 1.
'''
opt = set_optimizer_options(name='StochasticGradientDescent', 
                                learning_rate=0.0003, 
                                lambda_1=0.5, 
                                lambda_2=0.5, 
                                steps=30, 
                                epoch=5,
                                factors=30,
                                batch_size=100,
                                stopping_eps=0.001)
#Q, P, nz_i_indices, nz_x_indices, nz_indices = init(M,opt)
val_loss, status = SGD(M=M, Q=Q, P=P, nnz_elem=nz_indices, val_set_indices=val_set_indices, M_val=M_val, opt=opt, timer=timer)
err = test_error(test_set_indices=test_set_indices, M_test=M_test, Q=Q, P=P)
print("STATUS: %s\nVALIDATION LOSS: %.4lf\nTEST ERROR: %.10lf" % (status, val_loss, err))









    



START 0-timer on: ITERATION {
 "stopping_eps": 0.001,
 "lambda_2": 0.5,
 "lambda_1": 0.5,
 "batch_size": 100,
 "optimizer": "StochasticGradientDescent",
 "epoch": 5,
 "factors": 30,
 "learning_rate": 0.0003,
 "steps": 30
}
END 1-timer on: ITERATION {
 "stopping_eps": 0.001,
 "lambda_2": 0.5,
 "lambda_1": 0.5,
 "batch_size": 100,
 "optimizer": "StochasticGradientDescent",
 "epoch": 5,
 "factors": 30,
 "learning_rate": 0.0003,
 "steps": 30
} in 58.69 sec
After 1 steps:
	Trainig Loss: 8393357298.7354
	Validation Loss: 1061097.3375
STATUS: Validation loss increased
VALIDATION LOSS: 1061097.3375
TEST ERROR: 1.9962536052

Report

Gradient Descent

SGD with batch size of 1

SGD with batch size greater than 1

Alternating Least Squares vs. Gradient Descent

Why using ALS ?

optimize over the entire loss function
transforms a non-convex problem into a convex one

Why (Stochastic) Gradient Descent ?

faster, because it optimizes only part of the optimization function.
not so good because it does not optimize the entire loss function, but only subset of it.
using normal gradient descent gets quite unfeasible on large datasets, if training on the entire dataset every time.



In [ ]: